## Code that creates figure with results from alternative estimators
## Last changed: 2025-05-23

sink("C:/Userdata/Shared/Dofiles/DoAnalysis/JanKalle/ReplicationFiles/AnalysisScripts/Output/FigureA3A4.txt", split=TRUE)

library(PSweight)
library(cobalt)
library(MatchIt)
library(distances)
library(moreparty)
library(party)
library(caret)
library(bonsai)
library(randomForest)
library(tidymodels)
library(tidyverse)
library(rmarkdown)
library(tinytex)
library(tibble)
library(knitr)
library(car)
library(fst)
library(stargazer)
library(pryr)
library(stringr)
library(fst)
library(lubridate)
library(data.table)
library(summarytools)
library(ggplot2)
library(cowplot)
library(ggpubr)
library(haven)
library(broom)
library(fixest)
library(modelsummary)
library(gt)
library(MASS)
library(survey)
library(collapse)
library(readxl)
library(Hmisc)
library(DescTools)
library(ranger)
library(stablelearner)
library(future.apply)
library(lme4)
library(partykit)
library(ggiplot)


# Specify the name of the treatment variable
  tvar <- "T94_90"

# Specify the the birth cohorts to be used for the analysis
  lbyr <- 1942
  ubyr <- 1967

# Specify if only include Swedish citizens 
  citizen <- 1

# Specify the propensity score specification
  psformula <- formula(T94 ~
                         Kon + DAStod_1982 + DAStod_1983 + DAStod_1984 +
                         DAStod_1985 + DAStod_1986 + DAStod_1987 + DAStod_1988 +
                         DAStod_1989 +
                         DAntBarn_91 + Sambo_91 + InvBak  |
                         SSYK3_90 + SNI2_91 + Kommun_91 +
                         rCARB_1982 + rCARB_1983 + rCARB_1984 + rCARB_1985 +
                         rCARB_1986 + rCARB_1987 + rCARB_1988 + rCARB_1989 +
                         pArbInk_1990 + pArbInk_1991 + UtbAr_91 +
                         FodAr + Nom82 + Nom85 + Nom88 + Nom91)
  
  Models <-  vector('list', 8)
  j <- 1

# Run models both for entire sample and highest SES quarter   
  q4ses <- c(0, 1)
    
for(q4 in q4ses) {  
  # Read in the data sets
  NomVald <- read_fst("E:/ProjData/Jan_Kalle/JoPReplications/DB_NomVald.fst", as.data.table=T)
  RTB <- read_fst("E:/ProjData/Jan_Kalle/JoPReplications/DB_RTB.fst", as.data.table=T)
  db <- read_fst("E:/ProjData/Jan_Kalle/JoPReplications/DB_Nom.fst", as.data.table=T)
  
  db[, T94 := get(tvar)]
  mdb <- db[between(FodAr, lbyr, ubyr) & (SvMedb82_18 >= citizen) & AllaVal==1,]
  rm(db)
  gc()
  
  # Split the in SES quartiles
  if(q4 == 1) {
    mdb[!is.na(avSES3), ravSES := cut_number(avSES3, 4, labels = FALSE)]
    mdb <- mdb[ravSES==4]
  }
  
# Use logit model to estimate propensity scores
  mdb <- mdb[complete.cases(mdb),]
  tm <- proc.time()
  logmod <- feglm(psformula,  
                data = mdb, family="binomial")
  proc.time()-tm
  summary(logmod)

# Extract predicted probability (propensity scores)
  if(logmod$nobs == logmod$nobs_origin){ 
    mdb[, pT94 := predict(logmod, type = "response")]
  }else{
    mdb[logmod$obs_selection$obsRemoved, pT94 := predict(logmod, type = "response")]
  }

# Use the PSWeight library to compute different type of propensity score weights  
  mdb <- mdb[complete.cases(mdb),]
  tmp <- proc.time()
  pw <- SumStat(ps.estimate = mdb[, pT94], zname = "T94", 
              xname =c("LopNr"), data = mdb[, .(pT94, T94, LopNr)], 
              weight = c("IPW", "overlap", "treated"))
proc.time()-tmp

  mdb[, a_ipw := pw$ps.weights$IPW]
  mdb[, a_ow := pw$ps.weights$overlap]
  mdb[, a_attw := pw$ps.weights$treated]

  nnmdb <- mdb[, .(LopNr, T94, pT94)]
  nnmdb <- nnmdb[complete.cases(nnmdb),]


## Do 1:1 nearest neighbor matching with replacement. Unfortunately MatchIt 
## is very slow with large data. So here we do the matching with   
## the nearest_neighbor_search function from Fredrik Sävjes 'distances' package.
## See https://dabblingwithdata.amedcalf.com/2022/11/20/a-super-fast-and-flexible-near-optimal-matching-method-using-the-quickmatch-library-in-r/
## for a blogpost that describe how to use this function for this purpose. 
## The search is extremely fast, and it provides the same matches as MatchIt
## unless their are tied propensity scores, then the functions seems to 
## break ties in different ways (might just be a seed issue).

  set.seed(2534)

# Create a distance object based on the propensity scores
  mydist <- distances(nnmdb[, .(LopNr, pT94)],
                    dist_variables = "pT94")

# Calculate the nn
  nns <- nearest_neighbor_search(
    distances = mydist,
    k = 1,
    query_indices = which(nnmdb$T94 == 1),
    search_indices = which(nnmdb$T94 == 0)
  )

# Extract the matches  
  nn_data <- as.data.table(t(nns), keep.rownames=T)
  setnames(nn_data, names(nn_data), c("T", "C1"), skip_absent=T)
  nn_data[, T := as.integer(T)]

# Extract the controls and create the ATT weights, which are 
# simply the number of times an individual is paired as a control, but
# scaled to sum to the number of uniquely matched control units 
# (see https://r.iq.harvard.edu/docs/matchit/2.4-20/How_Exactly_are.html).

  w_db <- cbind(nn_data, nnmdb[nn_data$C1, .(LopNr)])
  setnames(w_db, "LopNr", "C1_LopNr")

  w_db <- cbind(w_db, nnmdb[nn_data$T, .(LopNr)])
  setnames(w_db, "LopNr", "T_LopNr")

  wc_db <- w_db[, .(C1_LopNr)]
  wc_db[, AntKont := .N, by = C1_LopNr]
  wc_db <- unique(wc_db[, .(AntKont, C1_LopNr)]) #Extract unique control individuals
  wc_db[, nnw := AntKont/mean(AntKont)] #Scale by dividing with the mean number of times the ind are used as controls
  setnames(wc_db, "C1_LopNr", "LopNr")

  wt_db <- w_db[, .(T_LopNr)]
  wt_db[, nnw := 1]
  setnames(wt_db, "T_LopNr", "LopNr")

  w_db <- rbindlist(list(wt_db, wc_db), use.names=T, fill=T)
  rm(nns, nn_data, wc_db, wt_db)

  mdb <- mdb[complete.cases(mdb),]

# Now use the Full matching approach of Sävje et al. to obtain ATT and ATE weights.  

  tmp <- proc.time()
  set.seed(7892)
  m.out <- matchit(T94~1, data = mdb,
                 distance = mdb$pT94, method = "quick", estimand = "ATT")
  proc.time()-tmp

  m.data1 <- as.data.table(match.data(m.out))[, .(LopNr, weights)]
  setnames(m.data1, "weights", "gm_att")

  mdb <- m.data1[, .(LopNr, gm_att)][mdb, on = "LopNr"]
  mdb <- w_db[mdb, on = "LopNr"]

  wdb <- mdb[, .(LopNr, a_attw, a_ow, gm_att, FodAr, Kon, T94, pT94, nnw)]
  
  gc()
  RTB <- mdb[RTB, on = "LopNr"]

  rm(mdb)
  gc()

  rm(list=setdiff(ls(), c("NomVald", "RTB", "rtbvar", "Models", "j", "q4ses", "q4", "tvar", "lbyr", "ubyr", "citizen", "psformula")))
  gc()

  mdb <- RTB[!is.na(a_ipw), .(Kon, DAStod_1982, DAStod_1983, DAStod_1984,
                 DAStod_1985, DAStod_1986, DAStod_1987, DAStod_1988,
                 DAStod_1989,
                 DAntBarn_91, Sambo_91, InvBak,
                 SSYK3_90, SNI2_91,
                 rCARB_1982, rCARB_1983, rCARB_1984, rCARB_1985,
                 rCARB_1986, rCARB_1987, rCARB_1988, rCARB_1989,
                 pArbInk_1990, pArbInk_1991, UtbAr_91, Kommun_91,
                 FodAr, Nom82, Nom85, Nom88, Nom91, Nom, T94, Ar, LopNr, a_attw, a_ow, gm_att, nnw)]

  gc()

  mdb[, Nom := Nom*100]

  rm(RTB)
  gc()
  
  Models[[1+q4*4]] <- feols(Nom~T94+i(Ar, T94, ref=1991) + Kon + DAStod_1982 + DAStod_1983 + DAStod_1984 +
                              DAStod_1985 + DAStod_1986 + DAStod_1987 + DAStod_1988 +
                              DAStod_1989 + DAntBarn_91 + Sambo_91 + InvBak | Ar + UtbAr_91 + FodAr + 
                              SSYK3_90 + SNI2_91 +
                              rCARB_1982 + rCARB_1983 + rCARB_1984 + rCARB_1985 +
                              rCARB_1986 + rCARB_1987 + rCARB_1988 + rCARB_1989 +
                              pArbInk_1990 + pArbInk_1991 + Kommun_91, cluster ="LopNr", 
                       mdb[])
  
  print(summary(Models[[1+q4*4]]))
  
  Models[[2+q4*4]] <- feols(Nom~T94+i(Ar, T94, ref=1991) | Ar, cluster ="LopNr", 
              weights = mdb[, a_attw], 
              mdb[])
  
  print(summary(Models[[2+q4*4]]))
  
  Models[[3+q4*4]] <- feols(Nom~T94+i(Ar, T94, ref=1991) | Ar, cluster ="LopNr", 
                       weights = mdb[, a_ow], 
                       mdb[])
  
  print(summary(Models[[3+q4*4]]))

  Models[[4+q4*4]] <- feols(Nom~T94+i(Ar, T94, ref=1991) | Ar, cluster ="LopNr", 
                       weights = mdb[, nnw], 
                       mdb[])
  
  print(summary(Models[[4+q4*4]]))
  gc()
  
}
  
  valar <- c(1982, 1985, 1988, 1991, 1994, 1998, 2002, 2006, 2010, 2014, 2018, 2022)

  titel <- c("Unmatched, covariate controls", "Inverse Probability Weighting",
             "Overlap Weighting", "1:1 Nearest Neighbor Matching")

  
## Now create the plots based on the regression results. 
for(i in 1:4) {
  gi <- ggiplot(Models[[i]]) 
  
  gp <- ggplot(gi$data, aes(x, estimate)) +
    geom_point() + 
    geom_hline(yintercept=0, linetype="longdash") +
    geom_errorbar(aes(ymin=ci_low,ymax=ci_high), width=0.4, size=.6) +
    geom_line(linetype="dotted") +
    scale_x_continuous(breaks = valar,  labels = valar, limits = c(1981, 2023)) +
    #scale_y_continuous(limits = c(-0.1, .3), breaks = seq(-0.1, .3, by = .05)) +
    xlab("Election Year") +
    ylab("Effect on Candidacy") +
    ggtitle(titel[i]) +
    theme_classic(base_size = 12) +
    theme(legend.title=element_blank()) +
    theme(legend.position = "bottom", panel.grid.major.y = element_line(color = "grey95"))

  gp
  assign(paste0("gp", i), gp)
}  
  
  ggarrange(gp2, gp3, gp4, gp1)
  ggsave("C:/Userdata/Shared/Dofiles/DoAnalysis/JanKalle/ReplicationFiles/AnalysisScripts/Figures/FigureA3.pdf", width = 15, height =10, units = "in") 
  
  for(i in 1:4) {
    gi <- ggiplot(Models[[i+4]]) 
    
    gp <- ggplot(gi$data, aes(x, estimate)) +
      geom_point() + 
      geom_hline(yintercept=0, linetype="longdash") +
      geom_errorbar(aes(ymin=ci_low,ymax=ci_high), width=0.4, size=.6) +
      geom_line(linetype="dotted") +
      scale_x_continuous(breaks = valar,  labels = valar, limits = c(1981, 2023)) +
      #scale_y_continuous(limits = c(-.3, 1), breaks = round(seq(-.3, 1, by = .1), digits=1)) +
      xlab("Election Year") +
      ylab("Effect on Candidacy") +
      ggtitle(titel[i]) +
      theme_classic(base_size = 12) +
      theme(legend.title=element_blank()) +
      theme(legend.position = "bottom", panel.grid.major.y = element_line(color = "grey95"))
    
    gp
    assign(paste0("gp", i), gp)
  }  
  
  ggarrange(gp2, gp3, gp4, gp1)
  ggsave("C:/Userdata/Shared/Dofiles/DoAnalysis/JanKalle/ReplicationFiles/AnalysisScripts/Figures/FigureA4.pdf", width = 15, height =10, units = "in") 
  
  modelsummary(Models[c(2, 3, 4, 1)], output = "C:/Userdata/Shared/Dofiles/DoAnalysis/JanKalle/ReplicationFiles/AnalysisScripts/Tables/TableFigA3.tex")
  modelsummary(Models[c(6, 7, 8, 5)], output = "C:/Userdata/Shared/Dofiles/DoAnalysis/JanKalle/ReplicationFiles/AnalysisScripts/Tables/TableFigA4.tex")
  
  sink()